library(tidyverse) # Data manipulation and visualization
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table) # Efficient data manipulation
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(skimr) # Data summary statistics
library(rstudioapi) # RStudio API for interactions with RStudio
library(inspectdf) # DataFrame inspection
library(mice) # Imputation of missing values
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(plotly) # Interactive plots
## Warning: package 'plotly' was built under R version 4.3.2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(highcharter) # Highcharts for R
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(recipes) # Preprocessing of data
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(caret) # Classification and regression training
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(purrr) # Functional programming
library(graphics) # Base R graphics
library(Hmisc) # Miscellaneous functions
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:plotly':
##
## subplot
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(glue) # String manipulation
library(h2o) # H2O.ai for machine learning
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:data.table':
##
## hour, month, week, year
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Read the community-crime dataset from disk (data.table::fread)
raw <- fread("crimes.csv")
# Structural overview: row/column counts, column types, first values
glimpse(raw)
## Rows: 1,994
## Columns: 20
## $ PctEmplProfServ <dbl> 0.41, 0.15, 0.29, 0.45, 0.38, 0.77, 0.53, 0.34, 0.…
## $ PctOccupManu <dbl> 0.25, 0.42, 0.49, 0.37, 0.42, 0.06, 0.33, 0.71, 0.…
## $ PctOccupMgmtProf <dbl> 0.52, 0.36, 0.32, 0.39, 0.46, 0.91, 0.49, 0.18, 0.…
## $ MalePctDivorce <dbl> 0.68, 1.00, 0.63, 0.34, 0.22, 0.49, 0.25, 0.38, 0.…
## $ MalePctNevMarr <dbl> 0.40, 0.63, 0.41, 0.45, 0.27, 0.57, 0.34, 0.47, 0.…
## $ FemalePctDiv <dbl> 0.75, 0.91, 0.71, 0.49, 0.20, 0.61, 0.28, 0.59, 0.…
## $ TotalPctDiv <dbl> 0.75, 1.00, 0.70, 0.44, 0.21, 0.58, 0.28, 0.52, 0.…
## $ PersPerFam <dbl> 0.35, 0.29, 0.45, 0.75, 0.51, 0.44, 0.42, 0.78, 0.…
## $ PctFam2Par <dbl> 0.55, 0.43, 0.42, 0.65, 0.91, 0.62, 0.77, 0.45, 0.…
## $ PctKids2Par <dbl> 0.59, 0.47, 0.44, 0.54, 0.91, 0.69, 0.81, 0.43, 0.…
## $ PctYoungKids2Par <dbl> 0.61, 0.60, 0.43, 0.83, 0.89, 0.87, 0.79, 0.34, 0.…
## $ PctTeen2Par <dbl> 0.56, 0.39, 0.43, 0.65, 0.85, 0.53, 0.74, 0.34, 0.…
## $ PctWorkMomYoungKids <dbl> 0.74, 0.46, 0.71, 0.85, 0.40, 0.30, 0.57, 0.29, 0.…
## $ PctWorkMom <dbl> 0.76, 0.53, 0.67, 0.86, 0.60, 0.43, 0.62, 0.27, 0.…
## $ NumIlleg <dbl> 0.04, 0.00, 0.01, 0.03, 0.00, 0.00, 0.00, 0.02, 0.…
## $ PctIlleg <dbl> 0.14, 0.24, 0.46, 0.33, 0.06, 0.11, 0.13, 0.50, 0.…
## $ NumImmig <dbl> 0.03, 0.01, 0.00, 0.02, 0.00, 0.04, 0.01, 0.02, 0.…
## $ PctImmigRecent <dbl> 0.24, 0.52, 0.07, 0.11, 0.03, 0.30, 0.00, 0.50, 0.…
## $ PctImmigRec5 <dbl> 0.27, 0.62, 0.06, 0.20, 0.07, 0.35, 0.02, 0.59, 0.…
## $ ViolentCrimesPerPop <dbl> 0.20, 0.67, 0.43, 0.12, 0.03, 0.14, 0.03, 0.55, 0.…
# Per-column count and percentage of missing values
inspect_na(raw)
## # A tibble: 20 × 3
## col_name cnt pcnt
## <chr> <int> <dbl>
## 1 PctEmplProfServ 0 0
## 2 PctOccupManu 0 0
## 3 PctOccupMgmtProf 0 0
## 4 MalePctDivorce 0 0
## 5 MalePctNevMarr 0 0
## 6 FemalePctDiv 0 0
## 7 TotalPctDiv 0 0
## 8 PersPerFam 0 0
## 9 PctFam2Par 0 0
## 10 PctKids2Par 0 0
## 11 PctYoungKids2Par 0 0
## 12 PctTeen2Par 0 0
## 13 PctWorkMomYoungKids 0 0
## 14 PctWorkMom 0 0
## 15 NumIlleg 0 0
## 16 PctIlleg 0 0
## 17 NumImmig 0 0
## 18 PctImmigRecent 0 0
## 19 PctImmigRec5 0 0
## 20 ViolentCrimesPerPop 0 0
# Open the full dataset in the interactive data viewer (RStudio)
View(raw)
# Names of the numeric columns.
# select_if() is superseded; select(where(...)) is the current idiom.
num_vars <- raw %>% select(where(is.numeric)) %>% names()
# Print the outlying values (boxplot's 1.5*IQR rule) for every numeric
# column; the boxplot itself is rendered as a side effect of the call.
for (col in num_vars) {
  out_vals <- boxplot(raw[[col]])$out
  if (length(out_vals) == 0) next
  print(paste0("----", col))
  print(out_vals)
}
## [1] "----PctEmplProfServ"
## [1] 1.00 1.00 0.85 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.93 0.87 1.00 1.00 0.85
## [16] 0.89 1.00 0.94 1.00 0.89 1.00 1.00 1.00 0.88 0.90 1.00 1.00 1.00 0.88 1.00
## [31] 0.97 1.00 0.87 0.85 1.00 0.93 1.00 1.00 0.95 0.97 1.00 1.00 0.86 0.98 1.00
## [46] 0.90 0.94 0.93 1.00 0.00 0.99 0.88 0.97 1.00 0.96 1.00 0.93 0.86 0.96 1.00
## [61] 1.00 0.89 1.00 0.90 0.90 0.90 1.00 1.00
## [1] "----PctOccupManu"
## [1] 1.00 0.99 0.99 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## [16] 1.00 0.97 0.92 1.00 1.00 0.99 1.00 1.00 1.00 0.96 1.00 0.93
## [1] "----PctOccupMgmtProf"
## [1] 0.91 0.96 0.98 0.99 1.00 0.93 0.90 0.96 0.92 0.95 1.00 1.00 1.00 0.96 1.00
## [16] 1.00 0.97 1.00 0.94 0.90 0.91 0.99 1.00 1.00 0.93 0.91 0.90 0.89 0.92 0.91
## [31] 0.93 0.96 1.00 0.95 0.98 1.00 1.00 1.00 0.92 0.90 0.98 1.00 0.93 0.91 1.00
## [46] 0.96 0.94 1.00 1.00 1.00 1.00 0.99 1.00 0.91 1.00 0.91 1.00 0.98 0.90 1.00
## [61] 1.00
## [1] "----MalePctDivorce"
## [1] 1.00 1.00 1.00 0.98 1.00
## [1] "----MalePctNevMarr"
## [1] 0.85 0.94 1.00 1.00 1.00 1.00 0.93 1.00 1.00 1.00 0.90 0.98 1.00 0.81 0.88
## [16] 1.00 1.00 0.86 0.86 0.96 1.00 0.89 0.80 0.85 0.95 1.00 0.84 1.00 0.80 1.00
## [31] 0.83 0.96 1.00 1.00 1.00 1.00 1.00 0.79 1.00 0.83 0.96 1.00 0.85 1.00 1.00
## [46] 1.00 0.97 1.00 0.88 1.00 0.81 0.79 0.84 1.00 0.84 0.92 1.00 0.91 0.83 0.02
## [61] 1.00 1.00 0.80 0.81 0.97 1.00 0.87 0.79 0.79 0.91 0.96 0.97 0.91 0.93 0.92
## [76] 1.00 1.00 0.79 0.94 1.00 0.99 0.95 0.83 0.90 0.84 1.00 1.00 0.86 1.00 0.00
## [91] 1.00 0.84 0.80 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.84 1.00 0.81 0.82
## [106] 0.85 0.87
## [1] "----PersPerFam"
## [1] 0.85 1.00 0.14 0.83 0.90 0.82 0.97 1.00 0.97 1.00 1.00 0.85 0.97 0.93 1.00
## [16] 1.00 0.83 0.84 0.85 1.00 0.01 0.90 0.88 0.92 1.00 1.00 1.00 0.92 1.00 1.00
## [31] 0.95 0.88 0.85 1.00 1.00 1.00 1.00 1.00 0.84 1.00 1.00 0.90 0.82 0.10 0.90
## [46] 0.06 0.11 0.87 0.81 0.86 1.00 0.14 0.98 0.08 0.90 1.00 0.96 0.84 0.06 1.00
## [61] 0.00 0.81 1.00 1.00 0.12 0.86 0.13 1.00 0.14 0.90 1.00 0.85 1.00 0.96 0.85
## [76] 1.00 0.90 0.84 0.82 0.87 1.00 0.81 0.09 1.00 0.15 0.88 1.00 1.00 1.00 0.14
## [91] 0.81 0.96 0.84 1.00 1.00 0.85 0.84 0.00 1.00 0.89 0.90 0.83 1.00 0.12 1.00
## [106] 1.00 0.95 0.85 0.84 0.00 1.00 0.00 1.00 0.10 1.00 0.91 0.85 0.88 0.06 0.88
## [121] 0.86
## [1] "----PctFam2Par"
## [1] 0.05 0.04 0.00 0.06 0.01 0.08 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.03
## [16] 0.00 0.01 0.08 0.00 0.00 0.00 0.06 0.00 0.06 0.07 0.08 0.05 0.00 0.06
## [1] "----PctKids2Par"
## [1] 0.05 0.05 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.00
## [16] 0.00 0.00 0.00 0.00 0.05 0.05 0.03 0.00 0.03
## [1] "----PctYoungKids2Par"
## [1] 0.00 0.06 0.03 0.00 0.00 0.00 0.00 0.00 0.06 0.00 0.02 0.00 0.02 0.00 0.00
## [16] 0.00 0.02 0.05 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.05 0.06
## [1] "----PctTeen2Par"
## [1] 0.10 0.05 0.00 0.09 0.09 0.00 0.00 0.03 0.11 0.00 0.10 0.00 0.04 0.10 0.10
## [16] 0.00 0.00 0.00 0.06 0.00 0.06 0.05 0.02 0.00 0.00 0.00 0.02 0.10 0.00 0.07
## [31] 0.00 0.05 0.10 0.04 0.00 0.08 0.02 0.00 0.05 0.00 0.03 0.03 0.06 0.00 0.10
## [46] 0.05 0.05 0.00 0.00 0.00
## [1] "----PctWorkMomYoungKids"
## [1] 0.00 0.00 0.03 0.03 0.97 1.00 1.00 0.98 0.02 0.01 0.00 0.00 0.02 0.00
## [1] "----PctWorkMom"
## [1] 0.06 1.00 0.06 0.00 0.00 0.00 0.00 0.05 0.05 0.03 0.03 0.07 0.00 0.00 0.00
## [16] 0.06 0.05 1.00 0.03 0.00 0.00 0.00 0.01 0.00 0.00 0.02 0.00 0.00 1.00
## [1] "----NumIlleg"
## [1] 0.13 0.08 0.49 0.12 0.07 0.09 0.14 0.09 0.06 0.06 0.07 0.10 0.19 0.06 0.07
## [16] 0.29 0.20 0.06 0.10 0.37 0.09 0.12 0.09 0.41 0.17 0.24 0.07 0.11 0.30 0.17
## [31] 0.09 0.42 0.06 0.08 0.43 0.26 0.11 0.47 0.09 0.08 0.13 0.27 0.45 0.12 0.11
## [46] 0.12 0.16 0.06 0.09 1.00 0.10 0.08 1.00 0.07 0.34 0.08 0.11 0.07 0.14 0.08
## [61] 0.07 0.09 0.28 0.06 0.23 0.13 0.06 0.06 0.67 0.24 0.06 0.37 0.07 0.06 0.10
## [76] 0.19 0.06 1.00 0.13 0.16 0.16 1.00 0.32 0.23 0.11 0.49 0.16 0.37 1.00 0.07
## [91] 0.93 0.11 0.16 0.20 1.00 0.10 0.15 0.11 1.00 0.07 0.07 0.22 0.21 0.10 0.22
## [106] 0.06 0.06 0.08 0.10 0.06 0.10 0.06 0.06 0.09 0.07 0.06 0.06 0.08 0.06 0.07
## [121] 0.55 0.74 0.06 0.11 0.18 0.10 0.11 1.00 0.09 0.23 0.06 0.11 0.18 0.36 0.37
## [136] 0.11 1.00 0.07 0.06 0.09 0.47 0.09 0.06 0.59 0.11 0.08 0.78 0.09 0.14 0.61
## [151] 0.09 0.06 0.11 0.28 0.06 0.12 0.06 0.27 0.15 0.49 0.06 1.00 0.07 0.39 0.09
## [166] 0.22 0.72 0.07 0.18 0.06 0.41 0.06 0.38 0.08 0.20 0.55 0.29 0.10 0.12 1.00
## [181] 0.18 0.12 0.06 0.08 0.06 0.27 0.06 0.39 0.08 0.47 0.09 0.09 0.06 0.10 0.13
## [196] 0.07 0.43 0.07 0.08 0.62 0.29 0.06 0.09 0.06 0.07 0.09 0.07 0.20 0.08 0.07
## [211] 0.06 0.13 0.35 0.06 0.07 0.07 0.17 0.12 0.06 0.36 0.07 0.13 0.08 0.09 0.06
## [226] 0.08 0.08 0.12 0.34 0.34 0.97 0.09 0.11 0.22 0.14 0.13 0.06 0.45 0.15 0.16
## [241] 0.72 0.19 0.11 0.77 0.47 0.13 0.11
## [1] "----PctIlleg"
## [1] 0.73 0.69 0.67 0.97 0.85 0.77 0.81 0.97 0.77 0.90 0.74 0.97 0.87 0.73 1.00
## [16] 0.75 0.91 0.70 1.00 0.91 1.00 0.73 0.72 0.73 0.98 0.79 0.91 1.00 0.74 1.00
## [31] 0.78 1.00 0.84 0.74 1.00 1.00 0.67 1.00 0.76 0.86 1.00 0.68 0.85 0.75 1.00
## [46] 1.00 1.00 1.00 1.00 0.91 0.68 0.69 0.67 0.94 0.86 0.99 0.74 1.00 0.85 0.68
## [61] 0.78 1.00 0.70 0.69 0.69 0.80 0.75 1.00 0.83 0.72 1.00 0.96 0.79 0.67 0.77
## [76] 1.00 0.69 0.99 1.00 1.00 0.75 0.70 0.84 0.76 0.88 0.79 0.78 1.00 1.00 0.73
## [91] 1.00 0.87 0.68 1.00 0.96 1.00 0.84 0.72 0.81 1.00 1.00 0.84 1.00 0.69 0.81
## [106] 0.74 1.00 0.87 0.87 0.70 0.72 1.00 0.75 0.94 1.00 0.75 0.72 0.84 0.94 1.00
## [121] 0.83 0.73 0.91 0.78 0.76 0.69 0.74 0.85 0.97 0.79 1.00 0.68 0.72 0.80 0.72
## [136] 0.67 0.78 0.70 0.74 1.00 1.00 0.69 1.00 1.00 0.81 1.00 1.00 0.74 0.85 0.83
## [151] 1.00 0.71 0.78 0.82 0.78 0.86 0.85
## [1] "----NumImmig"
## [1] 0.28 0.14 0.16 0.06 0.10 0.23 0.11 0.07 0.08 0.25 0.14 0.29 0.14 0.21 0.15
## [16] 0.07 0.09 0.11 0.14 0.08 0.08 0.38 0.06 0.20 0.09 0.07 0.06 0.34 0.08 0.10
## [31] 0.06 0.70 0.07 0.06 0.11 0.45 0.27 0.07 0.10 0.19 0.20 0.16 0.13 0.34 0.58
## [46] 0.08 0.21 0.27 0.07 0.17 0.07 0.07 0.16 0.14 0.12 0.06 0.21 0.15 0.13 0.08
## [61] 1.00 0.06 0.09 0.09 0.07 1.00 0.07 0.06 0.07 0.07 0.14 0.06 0.09 0.07 0.27
## [76] 0.09 0.13 0.77 0.06 0.70 0.22 0.06 0.13 0.84 0.09 0.15 0.16 0.16 0.06 0.12
## [91] 0.06 0.26 0.14 0.06 0.06 0.09 0.10 0.11 0.07 0.19 0.13 0.06 0.08 0.16 0.08
## [106] 0.06 0.16 0.08 0.07 0.07 0.07 0.08 0.14 0.21 0.12 0.18 0.08 0.18 0.81 0.07
## [121] 0.39 0.07 0.12 0.14 0.20 1.00 0.07 0.09 0.08 0.59 0.23 0.10 0.16 1.00 0.06
## [136] 0.08 0.14 0.49 0.13 0.28 1.00 0.10 0.22 0.06 0.09 0.11 0.07 0.10 0.09 0.14
## [151] 0.14 0.41 0.15 0.07 0.57 0.06 0.18 0.11 0.17 0.32 0.13 0.10 0.06 0.06 0.11
## [166] 0.07 0.06 0.32 0.10 0.10 0.13 0.08 0.09 0.30 0.06 0.15 0.15 0.09 0.07 0.07
## [181] 0.06 0.29 0.18 0.15 0.16 0.08 0.09 0.09 0.12 0.08 0.23 0.08 0.09 0.10 0.14
## [196] 0.20 0.07 0.13 0.13 0.17 0.06 0.20 0.11 0.08 0.14 0.15 0.14 0.54 0.06 0.09
## [211] 0.12 0.06 0.12 0.07 0.18 0.24 0.08 0.16 0.24 0.13 0.34 0.08 0.07 0.10 0.06
## [226] 0.07 0.06 0.28 0.06 0.13 0.07 0.09 0.06 0.07 0.15 0.08 1.00 0.09 0.10 0.09
## [241] 0.11 1.00 0.06 0.06 0.20
## [1] "----PctImmigRecent"
## [1] 1.00 1.00 0.93 1.00 0.94 1.00 1.00 0.87 1.00 1.00 1.00 0.96 1.00 1.00 0.91
## [16] 0.87 1.00 1.00 0.85 0.94 0.90 0.88 0.98 0.95 1.00 0.90 1.00 1.00 0.84 1.00
## [31] 1.00 0.93 0.98 0.95 1.00 0.84 1.00 1.00 1.00 1.00 0.96 0.95 0.90 1.00 1.00
## [46] 1.00 1.00 0.94 1.00 1.00 1.00 1.00 1.00 0.88 0.99 0.87 1.00 1.00 0.96 1.00
## [61] 0.89 1.00 0.94 0.92 1.00 1.00 0.89 1.00 1.00 0.96 0.93
## [1] "----PctImmigRec5"
## [1] 0.91 0.90 1.00 0.94 1.00 0.97 0.95 1.00 0.98 0.94 0.91 1.00 1.00 0.93 1.00
## [16] 0.93 0.90 1.00 0.94 1.00 1.00 1.00 0.90 1.00 0.92 0.90 1.00 0.97 0.96 1.00
## [31] 0.97 1.00 0.95 1.00 0.90 0.98 1.00 1.00 1.00 1.00 1.00 0.91 0.99 0.91 1.00
## [46] 1.00 0.93 1.00 1.00 0.94
## [1] "----ViolentCrimesPerPop"
## [1] 0.84 0.80 0.75 1.00 0.86 1.00 1.00 0.86 0.87 0.73 0.73 0.83 0.74 0.88 1.00
## [16] 0.74 0.93 1.00 1.00 0.83 0.80 1.00 1.00 1.00 1.00 0.76 1.00 1.00 1.00 0.78
## [31] 1.00 0.91 0.86 0.85 1.00 0.87 0.87 0.82 0.89 0.85 0.94 1.00 0.81 0.95 0.81
## [46] 1.00 0.82 0.90 1.00 1.00 1.00 0.79 1.00 0.97 0.85 0.76 1.00 0.74 0.73 1.00
## [61] 1.00 0.77 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.82 1.00 0.76
## [76] 0.84 0.97 1.00 0.96 0.90 1.00 0.83 0.82 1.00 0.86 0.81 0.76 0.93 0.75 0.78
## [91] 1.00 0.87 0.91 0.81 0.80 0.90 0.79 0.75 1.00 1.00 1.00 0.75 0.83 0.81 0.86
## [106] 1.00 0.95 0.88 1.00 0.75
# Columns whose outliers will be capped (16 of the 20 variables)
columns_to_process <- c(
  # occupation / employment
  "PctEmplProfServ", "PctOccupManu", "PctOccupMgmtProf",
  # marital status
  "MalePctDivorce", "MalePctNevMarr",
  # family structure
  "PctFam2Par", "PersPerFam", "PctKids2Par", "PctYoungKids2Par",
  "PctTeen2Par", "PctWorkMomYoungKids", "PctWorkMom",
  # births, immigration, and the target variable
  "NumIlleg", "PctIlleg", "PctImmigRecent", "ViolentCrimesPerPop"
)
# Keep an untouched copy of the data before capping outliers.
# copy() guarantees an independent data.table (no shared reference).
raw_no_outliers <- copy(raw)
# Cap outliers at the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Fixes vs. the original draft:
#  * both fences are computed from the UNmodified column, so the lower
#    fence no longer depends on values already capped at the upper fence
#  * no variable named `median` masking stats::median()
#  * the high/low outlier split uses plain logical subsetting instead of
#    the ifelse()/na.omit()/as.matrix()/t()/.[1, ] chain
#  * boxplot.stats() extracts the outliers without rendering a plot
for (col_name in columns_to_process) {
  x <- raw[[col_name]]
  out_vals <- boxplot.stats(x)$out
  if (length(out_vals) == 0) next
  med <- median(x)
  upper_fence <- quantile(x, 0.75) + 1.5 * IQR(x)
  lower_fence <- quantile(x, 0.25) - 1.5 * IQR(x)
  # Rows matching a high outlier value are capped at the upper fence,
  # rows matching a low outlier value at the lower fence
  set(raw, i = which(x %in% out_vals[out_vals > med]), j = col_name,
      value = upper_fence)
  set(raw, i = which(x %in% out_vals[out_vals < med]), j = col_name,
      value = lower_fence)
}
# Stack the capped data on top of the untouched copy (doubles the rows)
data <- rbind(raw, raw_no_outliers)
# Target variable and the remaining columns as predictors
target <- "ViolentCrimesPerPop"
features <- setdiff(names(data), target)
# Build the model formula: target ~ f1 + f2 + ...
f <- reformulate(features, response = target)
# Ordinary least squares fit (glm with the default gaussian family)
glm <- glm(f, data = data)
# Coefficient estimates, standard errors and p-values
summary(glm)
##
## Call:
## glm(formula = f, data = data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.41675 0.04160 10.019 < 2e-16 ***
## PctEmplProfServ -0.03087 0.01968 -1.569 0.116831
## PctOccupManu -0.13375 0.02018 -6.628 3.87e-11 ***
## PctOccupMgmtProf 0.01350 0.02563 0.527 0.598433
## MalePctDivorce 0.75811 0.15309 4.952 7.65e-07 ***
## MalePctNevMarr -0.07459 0.02004 -3.722 0.000200 ***
## FemalePctDiv 0.78018 0.19084 4.088 4.44e-05 ***
## TotalPctDiv -1.31599 0.32603 -4.036 5.53e-05 ***
## PersPerFam 0.11925 0.02215 5.383 7.74e-08 ***
## PctFam2Par 0.31625 0.08795 3.596 0.000328 ***
## PctKids2Par -0.84341 0.07381 -11.427 < 2e-16 ***
## PctYoungKids2Par 0.06529 0.03071 2.126 0.033545 *
## PctTeen2Par 0.01765 0.02899 0.609 0.542695
## PctWorkMomYoungKids 0.09316 0.03037 3.067 0.002175 **
## PctWorkMom -0.16854 0.03017 -5.586 2.48e-08 ***
## NumIlleg 0.14816 0.03443 4.304 1.72e-05 ***
## PctIlleg 0.32116 0.02675 12.007 < 2e-16 ***
## NumImmig 0.16794 0.02980 5.635 1.87e-08 ***
## PctImmigRecent 0.02044 0.02303 0.888 0.374838
## PctImmigRec5 -0.01042 0.02366 -0.440 0.659700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.01741977)
##
## Null deviance: 192.668 on 3987 degrees of freedom
## Residual deviance: 69.122 on 3968 degrees of freedom
## AIC: -4812.6
##
## Number of Fisher Scoring iterations: 2
# Check and handle multicollinearity using VIF
library(faraway)
##
## Attaching package: 'faraway'
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:mice':
##
## mammalsleep
# Iteratively drop the predictor with the largest variance inflation
# factor and refit, until every remaining VIF is below 1.5
repeat {
  vif_sorted <- sort(faraway::vif(glm), decreasing = TRUE)
  if (vif_sorted[1] < 1.5) break
  remaining <- names(vif_sorted)[-1]
  f <- as.formula(paste(target, paste(remaining, collapse = " + "), sep = " ~ "))
  glm <- glm(f, data = data)
}
# Initialize H2O: starts (or connects to) a local H2O cluster
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 17 minutes 10 seconds
## H2O cluster timezone: Asia/Baku
## H2O data parsing timezone: UTC
## H2O cluster version: 3.42.0.2
## H2O cluster version age: 5 months and 3 days
## H2O cluster name: H2O_started_from_R_ACER_osn291
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.93 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.1 (2023-06-16 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (5 months and 3 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the combined data frame to the H2O cluster as an H2OFrame
h2o_data <- as.h2o(data)
##
|
| | 0%
|
|======================================================================| 100%
# 70% / 30% train-test split, seeded for reproducibility
h2o_data <- h2o.splitFrame(h2o_data, ratios = 0.7, seed = 123)
train <- h2o_data[[1]]
test <- h2o_data[[2]]
# H2O GLM with 10-fold cross-validation. lambda = 0 disables
# regularization, which is required for compute_p_values = TRUE.
model <- h2o.glm(
  y = target, x = features,
  training_frame = train, validation_frame = test,
  nfolds = 10, seed = 123,
  lambda = 0, compute_p_values = TRUE
)
##
|
| | 0%
|
|================================================================= | 93%
|
|======================================================================| 100%
# Feature p-values, intercept row dropped, least significant first
model@model$coefficients_table %>%
  as.data.frame() %>%
  transmute(names, p_value = round(p_value, 3)) %>%
  slice(-1) %>%
  arrange(desc(p_value))
## names p_value
## 1 PctImmigRec5 0.686
## 2 PctYoungKids2Par 0.345
## 3 PctEmplProfServ 0.239
## 4 PctOccupMgmtProf 0.236
## 5 PctTeen2Par 0.224
## 6 PctWorkMomYoungKids 0.221
## 7 PctImmigRecent 0.143
## 8 PctWorkMom 0.002
## 9 FemalePctDiv 0.001
## 10 TotalPctDiv 0.001
## 11 PctFam2Par 0.001
## 12 PctOccupManu 0.000
## 13 MalePctDivorce 0.000
## 14 MalePctNevMarr 0.000
## 15 PersPerFam 0.000
## 16 PctKids2Par 0.000
## 17 NumIlleg 0.000
## 18 PctIlleg 0.000
## 19 NumImmig 0.000
# Backward elimination: repeatedly drop the feature with the highest
# p-value and refit, until every remaining feature is significant
# (p <= 0.05). Fixes vs. the original draft:
#  * the stopping check and the drop choice now use the same table
#    (intercept and NaN rows excluded from both) -- previously the
#    condition included the intercept row while the body excluded it
#  * all_of() replaces the deprecated bare external-vector select()
#    (the source of the tidyselect deprecation warnings)
#  * an empty-table guard ensures the break test never sees NA
repeat {
  p_vals <- model@model$coefficients_table %>%
    as.data.frame() %>%
    dplyr::select(names, p_value) %>%
    mutate(p_value = round(p_value, 3)) %>%
    filter(!is.nan(p_value)) %>%
    slice(-1) %>% # drop the intercept row
    arrange(desc(p_value))
  if (nrow(p_vals) == 0 || p_vals$p_value[1] <= 0.05) break
  # Remove the least significant feature and rebuild the H2O frames
  features <- setdiff(features, p_vals$names[1])
  train <- train %>% as.data.frame() %>%
    select(all_of(c(target, features))) %>% as.h2o()
  test <- test %>% as.data.frame() %>%
    select(all_of(c(target, features))) %>% as.h2o()
  model <- h2o.glm(
    x = features, y = target,
    training_frame = train,
    validation_frame = test,
    nfolds = 10, seed = 123,
    lambda = 0, compute_p_values = TRUE
  )
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(target)
##
## # Now:
## data %>% select(all_of(target))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(features)
##
## # Now:
## data %>% select(all_of(features))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
# Final model: every retained term (intercept included) with its p-value
model@model$coefficients_table %>%
  as.data.frame() %>%
  transmute(names, p_value = round(p_value, 3))
## names p_value
## 1 Intercept 0.000
## 2 PctOccupManu 0.000
## 3 MalePctDivorce 0.000
## 4 MalePctNevMarr 0.000
## 5 FemalePctDiv 0.001
## 6 TotalPctDiv 0.001
## 7 PersPerFam 0.000
## 8 PctFam2Par 0.000
## 9 PctKids2Par 0.000
## 10 PctWorkMom 0.000
## 11 NumIlleg 0.001
## 12 PctIlleg 0.000
## 13 NumImmig 0.000
## 14 PctImmigRecent 0.023
# Score the hold-out frame and pull the predictions back into an R
# data frame
y_pred <- as.data.frame(h2o.predict(model, newdata = test))
##
|
| | 0%
|
|======================================================================| 100%
# Predicted ViolentCrimesPerPop values for the test rows
y_pred[["predict"]]
## [1] 0.3276683542 0.2357919834 0.0534537913 0.3494565171 0.2311271800
## [6] 0.1663099875 0.4968704404 0.1105342632 0.3431871104 0.0699129828
## [11] 0.2543534999 0.1522864310 0.2265534135 0.1568405943 0.2370203774
## [16] 0.1717416861 0.3093523825 0.0943840172 0.0380896333 0.3137111595
## [21] 0.3735478821 0.4210084681 0.1790765208 0.2727645779 0.0113056871
## [26] 0.1020025277 0.0941117165 0.2059295812 0.0341031059 0.0714439365
## [31] 0.1246059133 0.0687818392 0.0020445651 0.5547666139 0.0369303978
## [36] 0.1412200806 0.2981599461 0.0781168390 0.0944174435 0.3141790815
## [41] 0.0614555998 0.5755999923 0.0761410282 0.0963628881 0.0638141011
## [46] 0.0551363678 0.5520727653 0.1046855850 0.1466760871 0.0837259969
## [51] 0.2369082059 0.0692056995 0.2336953908 0.0299392204 0.2254516582
## [56] 0.1381375736 0.3478849005 0.2876897546 0.1311932085 0.3368332979
## [61] 0.0547026835 0.2675594967 0.1728135743 0.3023737963 0.3090218634
## [66] 0.0539553052 0.3892380906 0.4583553896 0.0570486367 0.1400643142
## [71] 0.0229954964 0.0470845004 0.2202217102 0.4114349995 0.1673957317
## [76] 0.2747396653 0.2180876111 0.2604723952 0.2532287086 0.5549083013
## [81] 0.3705625842 0.0388835193 0.1356754448 0.2137891553 0.0079981088
## [86] 0.0858347013 0.3153278757 0.1462209026 0.1354191597 0.3351150322
## [91] 0.4704790036 0.0650049035 0.3551666480 0.1315347526 0.3969806842
## [96] 0.0977206838 0.3876120435 0.3359113267 0.2103092377 0.1794680227
## [101] 0.2924862601 0.3482652429 0.3473434877 0.2996989158 0.1324205085
## [106] 0.1901589285 0.0419287557 0.0148407050 0.0883544721 0.1385576219
## [111] 0.0913877109 0.1665406677 0.4011130941 0.2224591260 0.0513132963
## [116] 0.0675738477 0.0813112195 0.0304761396 0.0640742400 0.1543668840
## [121] 0.2884739374 0.1300724883 0.0657865549 0.3110713403 0.5133754027
## [126] 0.1455118664 0.4875738379 0.2735926883 0.0584550837 0.1262153131
## [131] 0.3516521412 0.4850977849 0.4521758390 0.3442535514 0.6757147397
## [136] 0.0444127106 0.0677609280 0.3294417066 0.5120019660 0.1433780575
## [141] 0.1044507622 0.2103058240 0.0977651488 0.1001903777 0.5570822409
## [146] 0.1697696525 0.0747019851 0.4752714882 0.6399772549 0.1530577168
## [151] 0.3253144310 0.2497426563 0.0296503574 0.3988087778 0.3069890281
## [156] 0.3159295879 0.1749361760 0.0524786024 0.3099293263 0.0122147599
## [161] 0.5880439710 0.0681995221 0.4072589825 0.2319517954 0.4863278870
## [166] 0.3392150902 0.2595194494 0.5889684794 0.2200985943 0.3447206292
## [171] 0.1150339769 0.0777016125 0.1241559956 0.2600617556 0.2200678684
## [176] 0.2158727729 0.0814623902 0.1075625724 0.1203224189 0.2348388199
## [181] 0.0651978778 0.0789719258 0.1719476584 0.0188340895 0.3815307602
## [186] 0.2503108053 0.4513658132 0.5421933530 0.3308912793 0.1522005217
## [191] 0.0227095544 0.1834173575 0.2466782480 0.3365900681 0.2133197269
## [196] 0.5204893140 0.1099211078 0.0842297078 0.0379865908 0.0957923767
## [201] 0.2544125646 0.1264668125 0.3203180269 0.0294087084 -0.0028648480
## [206] 0.0185609189 0.0324773834 0.1587637375 0.0947755550 0.3390141137
## [211] 0.2703703891 0.2965136942 0.6471980832 0.4202904297 0.4578554628
## [216] 0.4927880290 0.3041244422 0.1821949766 0.4293715308 0.0186976418
## [221] 0.2572416822 0.0587745040 0.2977843403 0.0825300133 0.0194179674
## [226] 0.2463686388 0.4774115493 0.5826297422 0.1500879937 0.1224696925
## [231] 0.2192468046 0.4714911006 0.2181169294 0.0523225690 0.1612151807
## [236] 0.0924366499 0.0554773026 0.0944655453 0.3041343563 0.1046919552
## [241] 0.1936317844 0.3052829976 0.3644097647 0.0144145152 0.2078642557
## [246] 0.2334014141 0.1308524708 0.5075264509 0.5585203882 0.1063400872
## [251] 0.0182096255 0.1343598988 0.3763869504 0.1858552064 0.3051480184
## [256] 0.2499407076 0.1764422378 0.1341358432 0.4367564428 0.0973136152
## [261] 0.1845645683 -0.0197551864 0.1977286220 0.2277634648 0.4033131201
## [266] 0.2608377433 0.0891652255 0.1987313813 0.2843607788 0.4086302008
## [271] 0.0557489774 0.0844502459 0.1070084184 0.1949653745 0.0479754894
## [276] 0.1960007026 0.0544028836 0.1613061753 0.2653254548 0.3850871659
## [281] 0.0630404883 0.1327992291 0.0312025472 0.5634427937 0.1121044594
## [286] 0.2475003974 0.2878162049 0.2210966498 0.0247305616 0.4179597215
## [291] 0.1785562217 0.0557874060 0.2339030978 0.1231292890 0.0676302165
## [296] 0.3181896926 0.4377270478 0.0232980541 0.0316605627 0.2356710598
## [301] 0.6447967640 0.1469607892 0.1864727321 0.0420495325 0.1217105303
## [306] 0.4198279109 0.1930846304 0.6453356642 0.4453948596 0.2272534676
## [311] 0.2555551299 0.0903560194 0.0646704646 0.5053467312 0.2592608600
## [316] 0.4020183175 0.3000640275 0.7458402928 0.0897619464 0.2474094577
## [321] 0.5427007017 0.2876150711 0.2143168365 -0.0041707412 0.1844005276
## [326] 0.0926341460 0.2566742750 0.3969003253 0.4016559791 0.0148170950
## [331] 0.2363206135 0.1488048674 0.0553313377 0.0577220356 0.0920355135
## [336] 0.2296516956 0.5211923677 0.3042392236 0.2366508021 0.1491577637
## [341] 0.1605046982 0.0522991784 0.0420106974 0.1269672618 0.2682411782
## [346] 0.3979552415 0.0990870127 0.2008606764 0.0033276494 0.5398004211
## [351] 0.3077190620 0.0830648248 0.1674135047 0.4433401803 0.2246707448
## [356] 0.0774024180 0.4013785429 0.2634401112 -0.0007724021 0.2401818061
## [361] 0.2700382090 0.0098386143 0.1370769294 0.0614108253 0.0780512212
## [366] -0.0058465042 0.0144134348 0.0993921307 0.1934008487 0.2616313157
## [371] 0.0823037484 0.2758802551 0.1027501223 0.5152014743 0.2377389797
## [376] 0.1631006004 0.2980801329 0.0333643276 0.0245769741 0.2781861285
## [381] 0.3618580913 0.1774946164 0.2259599059 0.5303274154 0.2603019970
## [386] 0.4575556231 0.3687723842 0.3198960690 0.1556149093 0.1137579109
## [391] 0.0715189097 0.0530945821 0.0628233407 0.1889443591 0.0313787087
## [396] 0.0700015994 0.0373000242 0.1703089569 0.1324141402 0.0584845623
## [401] 0.3722915917 0.1324596395 0.2784955876 0.2303506567 0.2398224397
## [406] 0.0143304831 0.1193675138 0.0613240491 0.5342946235 0.2141984540
## [411] 0.3813439682 -0.0389484575 0.6287908017 0.1061597046 0.0237036033
## [416] 0.4346885389 0.0073361885 0.1143876266 0.1921423827 0.3083370069
## [421] 0.3157155966 0.2879546192 0.0322758963 0.1378929663 0.2566997238
## [426] 0.5031482378 0.5451388562 0.0131674517 0.0208843706 0.3201456647
## [431] 0.1918028429 0.4081394788 0.3321069097 0.2405460187 0.1244057660
## [436] 0.0630750414 0.2940231596 0.1415497141 0.1242023077 0.2773677955
## [441] 0.5129845646 0.5042148347 0.0198417849 0.2038218447 0.4046668379
## [446] 0.2321575159 0.1705484457 0.4346071829 0.4126313425 0.5207212974
## [451] 0.1290190690 0.1419834013 0.3464499890 0.0918739549 0.1801397306
## [456] 0.1050806504 0.1037864283 0.4827270573 0.0826790551 0.3212171067
## [461] 0.0875871935 -0.0169912507 0.0796132366 0.3223482610 0.0844822971
## [466] 0.0683803610 0.5676598277 0.2078135813 0.2353955276 0.0638769902
## [471] 0.4434760203 0.1912741982 0.0034685054 0.4871913718 0.2807007551
## [476] 0.1904795260 0.5197884787 0.0481477635 0.3363011675 0.1083048867
## [481] 0.5658682796 0.3912134694 0.2257445792 0.0048804348 0.1195444332
## [486] 0.0863810981 0.5991302995 0.4608903990 0.2264640960 0.1330355944
## [491] 0.5416833080 0.5514325010 0.0554357750 0.3709822074 0.3735295898
## [496] 0.0482784827 0.2987401973 0.1399053044 0.4672553308 0.0080654682
## [501] 0.4782980022 0.1436380920 0.2323510088 0.4212486632 0.0825132000
## [506] 0.3243257872 0.4293463247 0.1215823064 0.1603904834 0.0662628972
## [511] 0.0566003361 0.3762516787 0.1464655639 0.0732313655 0.2598614151
## [516] 0.5795170438 0.2136955152 0.3914829509 0.1741895138 0.5124184002
## [521] 0.1239881661 0.1485055115 0.0412428732 0.4725326705 0.2224613131
## [526] 0.1019409261 0.3390401759 0.5540213273 0.3055388577 0.2769173656
## [531] 0.0405204643 0.2865107108 0.2132127886 0.1281593244 0.0363793699
## [536] 0.1003308967 0.6724537800 0.4035266707 0.2542470477 0.4788257740
## [541] 0.2029103524 0.6832358724 0.0790981055 0.1813502617 0.2376983393
## [546] 0.0849069008 0.2231191857 0.3144856394 0.1650242244 0.2992959201
## [551] 0.1323941221 0.1798158435 0.2359940543 0.0658562688 0.0867287315
## [556] 0.1873501292 0.3487606603 0.0994161787 0.2112989519 0.1710237542
## [561] 0.0978060347 0.3027364421 0.1299602054 0.5774249267 0.2227057837
## [566] 0.2462094026 0.4642187439 0.2950640445 0.1796621541 0.3843816428
## [571] 0.0626102163 0.0656780350 0.0919567097 0.0290284649 0.2374671960
## [576] 0.2155007815 0.0934027529 0.1986447380 0.5426562451 0.3225957696
## [581] 0.2614802838 0.1785278402 0.1798598739 0.3496668304 0.0388582929
## [586] 0.0417841843 0.1230392633 0.4014595142 0.0426371837 0.4019187729
## [591] 0.2311271800 0.1663099875 0.5155289271 0.0915050465 0.4240997755
## [596] 0.0699129828 0.1373525919 0.2727090330 0.1522864310 0.0676447542
## [601] 0.2237493900 0.3152354464 0.3137111595 0.1790765208 0.0454784599
## [606] 0.1526004668 0.3674611780 0.2393688577 0.0113056871 0.1221146459
## [611] 0.0955806912 0.1309411307 0.2494746448 0.0320134209 0.2291831699
## [616] 0.2919884331 0.5245990330 0.1045488578 0.2196790544 0.0020445651
## [621] 0.2696640269 0.2981599461 0.0781168390 0.0879482646 0.3261985674
## [626] 0.1631398959 0.0761410282 0.2993509777 0.2182627918 0.5900770685
## [631] 0.4012131951 0.6431760922 0.0709717584 0.3397909515 0.1046855850
## [636] 0.1381535636 0.0837259969 0.0718952331 0.3987253088 0.2336953908
## [641] 0.2927413717 0.0939959898 0.3594004123 0.2254516582 0.0229006061
## [646] 0.2675594967 0.1728135743 0.4087116802 0.2371036213 0.3294549896
## [651] 0.0530993975 0.2128387416 0.1400643142 0.0470845004 0.0796131014
## [656] 0.0843203687 0.0380190442 0.3323814378 0.4068764790 0.2604723952
## [661] 0.1419926016 0.4121031913 0.0199957961 0.2532287086 0.5708178820
## [666] 0.2017649189 0.1356754448 0.2137891553 0.3153278757 0.3978166530
## [671] 0.1005276326 0.0540290959 0.2702925651 -0.0149592889 0.3745533110
## [676] 0.0854935589 0.0222020494 0.2206530778 0.1457124859 0.2544434743
## [681] 0.3192285048 0.3232566515 0.0391178207 0.1328095065 0.1817039516
## [686] 0.3997006235 0.1198546749 0.3515366585 0.1405594762 0.3226232675
## [691] 0.1173185439 0.2996989158 0.0419287557 0.8201829294 0.0844963802
## [696] 0.0404449217 0.2410453271 0.1385576219 0.0151777550 0.0362605688
## [701] 0.1559252436 0.2290648691 0.0640742400 0.6975932982 0.3110713403
## [706] 0.5417379232 0.1917444523 0.2682852481 0.2678136175 0.0807150683
## [711] 0.4762566756 0.3115231455 0.2500932847 0.3111373089 0.4947133736
## [716] 0.2141609809 0.3915933466 0.9466201536 0.5280201306 0.0350040023
## [721] 0.4786463045 0.0444127106 0.1433780575 0.1044507622 0.2103058240
## [726] 0.0514541419 0.7529642073 0.3311974950 0.5820787648 0.3387319322
## [731] 0.3069890281 0.1817459568 0.1411888082 0.4344983315 0.1749361760
## [736] 0.2825424331 0.0143664032 0.0387139175 0.4191322254 0.3099293263
## [741] 0.0122147599 0.1184927901 0.1872122037 0.3251889191 0.3480610003
## [746] 0.0919776528 0.5775153780 0.1112072613 0.4737946628 0.2200985943
## [751] 0.3447206292 0.5302459822 0.4641708952 0.2919630391 0.1511507103
## [756] 0.1241559956 0.1218278853 0.2600617556 0.2208417397 0.0971961808
## [761] 0.0417653957 0.1075625724 0.6563288637 0.3376873255 0.1239416432
## [766] 0.1665500950 0.0453673577 0.2546715084 0.3815307602 0.3841530940
## [771] 0.5421933530 0.1522005217 0.3345982528 0.3365900681 0.2882648456
## [776] 0.8213235814 0.7881302226 0.2545480448 0.0957923767 0.4238654678
## [781] 0.1260113504 0.1264668125 0.1096841534 0.0294087084 0.3822485061
## [786] 0.0828366863 0.1555816237 0.0324773834 0.2043093203 0.3551925395
## [791] 0.5113723599 0.2965136942 0.1786926106 0.8370494134 0.2037242657
## [796] 0.0796954009 0.7885382076 0.4817127334 0.1912323214 0.3186047471
## [801] 0.1730460698 0.2657430621 0.1657608494 0.1559138488 0.7417171563
## [806] 0.7064872703 0.1224696925 0.2192468046 0.6511514153 0.0322655135
## [811] 0.1227532759 0.1612151807 0.3009524507 0.0880247122 0.1046919552
## [816] 0.1453489316 0.4074345782 0.3644097647 0.4682137153 0.1861190639
## [821] 0.2334014141 0.8265211373 0.3905325686 0.1870064806 0.0395631953
## [826] -0.0059291212 0.1324835687 0.2733459155 0.0662892754 0.1567532194
## [831] 0.3195232577 0.3276688149 0.1824060954 0.6859853581 0.3051480184
## [836] 0.2516754465 0.1018390610 0.1341358432 0.0953975928 0.6125567477
## [841] 0.0973136152 0.5438283775 0.1353562622 0.0515667770 0.0593339287
## [846] 0.1300424202 0.1818416043 0.2998613357 0.0323651479 0.1987313813
## [851] 0.2535170721 0.0899245227 0.4783473373 0.4566790039 0.4086302008
## [856] 0.0557489774 0.1449744894 0.1566643255 0.0798451640 0.3654225454
## [861] 0.1949653745 0.0479754894 0.0090304145 0.1613061753 0.1455932183
## [866] 0.3850871659 0.1327992291 0.0681990396 0.3679246665 0.5774786688
## [871] 0.0312025472 0.6638401556 0.1121044594 0.2657715658 0.2475003974
## [876] 0.2878162049 0.4179597215 0.1951023890 0.0328216603 0.1264353401
## [881] 0.4610250233 0.0676302165 0.1865246215 0.4799839336 0.0506646077
## [886] 0.1224290989 0.0849733285 0.0908536746 0.0315805919 0.0420495325
## [891] 0.1582325867 0.2151444653 0.0831963897 0.1295088510 0.3297232774
## [896] 0.1330978651 0.2395036527 0.0905795702 0.4646260372 0.2555551299
## [901] 0.2740781616 0.5464084553 0.0124037050 0.1754829958 0.0235861285
## [906] 0.0756822182 0.0295486897 0.0897619464 0.2471117637 0.3212570278
## [911] 0.5759229975 0.3616064797 0.0572640402 0.1124111165 0.2934981351
## [916] 0.2782775451 0.4315518511 0.1425013949 0.1257227421 0.1844005276
## [921] 0.2983512747 0.0104702772 0.2797949764 0.1488048674 0.0920355135
## [926] 0.1545328952 0.2791166462 0.0753886732 0.1810004025 0.0933343056
## [931] 0.0433184893 0.3042392236 0.0522991784 0.0420106974 0.2303541778
## [936] 0.2182122081 0.1336507939 0.0990870127 0.2946345411 0.5800524808
## [941] 0.0797759169 0.5398004211 0.1545460891 0.1177699565 0.2060130012
## [946] 0.8000745592 0.3730611767 0.0297629595 0.3433650862 0.4433401803
## [951] 0.2663341689 0.0665190749 0.0615590076 0.0055733783 0.0745237563
## [956] 0.3087753671 0.5647669107 0.0206777769 0.1575714731 0.4105309903
## [961] 0.0780512212 0.1060556398 0.4185156737 0.0512461878 0.2742669239
## [966] 0.1654669473 0.0718753641 0.0197811247 0.3484296951 0.2418402030
## [971] 0.9334025992 0.2377389797 0.3045897156 0.0319542688 0.3055747452
## [976] 0.6563036320 0.7050489159 0.2781861285 0.3618580913 0.0789962940
## [981] 0.1460169625 0.1253071125 0.3518574244 0.3514983893 0.0346459836
## [986] 0.3052371771 0.3391272465 0.1154535305 0.3729714163 0.4887477022
## [991] 0.5715008257 0.0715189097 0.0530945821 0.3632962005 0.0975164514
## [996] 0.1889443591 0.0654536098 0.2387735069 0.0997226848 0.1324141402
## [1001] 0.1491367435 0.2529002743 0.0513740802 0.0143304831 0.2025549750
## [1006] 0.0613240491 0.3196153113 0.1248726681 0.5843439810 0.0650210088
## [1011] 0.0432874638 0.2898759322 0.3996350751 0.6332030997 0.3412460240
## [1016] 0.4346885389 0.1833081411 0.4076900417 0.0821985754 0.1143876266
## [1021] 0.1837287425 0.0548118905 0.3778906464 0.0875356807 0.3157155966
## [1026] 0.4887531777 0.0616432585 0.2330606627 0.2566997238 0.2096893330
## [1031] 0.4459638514 0.1575473653 0.2756395414 0.0468350697 0.0641222234
## [1036] 0.3274994947 0.2711466340 0.0288918054 0.4697714831 0.3321069097
## [1041] 0.8539435070 0.1244057660 0.7911102551 0.2006513024 0.0766123917
## [1046] 0.3821446374 0.5423773029 0.5068893693 0.2038218447 0.0090226102
## [1051] 0.1169037061 0.2495169227 0.1003841189 0.3068825029 0.1219338321
## [1056] 0.4170436404 0.5141983686 0.1146504376 0.0918739549 0.0975314880
## [1061] 0.1050806504 0.2931192079 0.1737944669 0.0486310449 0.0393583907
## [1066] 0.4827270573 0.2775830276 0.2427191022 0.2389811694 0.0796132366
## [1071] 0.3223482610 0.0844822971 0.5988667380 0.2078135813 0.2511815220
## [1076] 0.2798116341 0.1430251561 0.3905951161 0.0569583048 0.3005063363
## [1081] 0.1083048867 0.7013572698 0.1128689524 0.3439401633 0.0048804348
## [1086] -0.0197448583 0.6416627896 0.6759499926 0.1728234563 0.0459195373
## [1091] 0.2264640960 0.0304273523 0.1330355944 0.0686542338 0.1610076327
## [1096] 0.5987206950 0.2197431527 0.3830016934 0.2276727503 0.0876982101
## [1101] 0.0813622411 0.4843527692 0.0367085566 0.1462900941 0.4212486632
## [1106] 0.0039405857 0.3221246590 0.1204908367 0.0631951156 -0.0062551549
## [1111] 0.0657170259 0.5080309805 0.4268972368 0.3502252094 0.2550536207
## [1116] 0.0912200074 0.1741895138 0.0080003925 0.6315964463 0.4059195934
## [1121] 0.4879779448 0.0413627975 0.2432439696 0.4479824960 0.6454804583
## [1126] 0.3055388577 0.0593053925 0.0405204643 0.2429783782 0.0471484724
## [1131] 0.0626286112 0.2144512648 0.1003308967 0.7264279909 0.4461788842
## [1136] 0.3581550137 0.0790981055 0.7455276412 0.2879377608 0.0211275610
## [1141] 0.0569399981 0.6522593673 0.0427567141 0.1392799007 0.2992959201
## [1146] 0.5988967979 0.7538561769 0.0658562688 0.0687731136 0.1710237542
## [1151] 0.1169782483 0.1895821982 0.1681123160 0.3027364421 0.1299602054
## [1156] 0.1176579771 0.1221066804 0.6510377867 0.1281656391 0.2389977110
## [1161] 0.5139111161 0.2198040332 0.3939972316 0.0170089790 0.0614087039
## [1166] 0.2551753959 0.1751490036 0.1396232336 0.3412700870 0.1986447380
## [1171] 0.0249513309 0.6069898225 0.4152720151 0.1785278402 0.0552466720
## [1176] 0.0587964402 0.2506116193 0.0757989306 -0.0076337076 0.0049113171
## [1181] 0.1230392633 0.1420917793
# Convert the H2O test frame to a base data frame so base/tidyverse
# functions can operate on its columns directly.
test_set <- test %>% as.data.frame()
# Residuals: observed minus predicted values on the test set.
residuals <- test_set$ViolentCrimesPerPop - y_pred$predict
# Root Mean Squared Error (RMSE): typical size of a prediction error,
# in the same units as the target. Use `<-` for assignment per R style.
RMSE <- sqrt(mean(residuals^2))
RMSE
## [1] 0.1287954
# Mean of the observed target values in the test set (the baseline a
# constant predictor would use).
y_test_mean <- mean(test_set$ViolentCrimesPerPop)
# Total Sum of Squares (TSS): target variability around its mean.
tss <- sum((test_set$ViolentCrimesPerPop - y_test_mean)^2)
# Residual Sum of Squares (RSS): variability left unexplained by the model.
rss <- sum(residuals^2)
# R-squared: proportion of target variance explained by the model.
R2 <- 1 - (rss / tss)
R2
## [1] 0.6644794
# Sample size and number of independent variables used by the model.
n <- test_set %>% nrow() # sample size
k <- features %>% length() # number of independent variables
# Adjusted R-squared: penalizes R2 for the number of predictors, so it
# only improves when an added variable genuinely improves the fit.
Adjusted_R2 <- 1 - (1 - R2) * ((n - 1) / (n - k - 1))
# Report the test-set metrics together. NOTE: RMSE is rounded to 1
# decimal for display only; the unrounded `RMSE` object is kept for
# later computation.
tibble(RMSE = round(RMSE, 1),
R2, Adjusted_R2)
## # A tibble: 1 × 3
## RMSE R2 Adjusted_R2
## <dbl> <dbl> <dbl>
## 1 0.1 0.664 0.661
# Combine predicted and observed test-set values into a data frame
# (cbind of two numeric vectors -> matrix -> data frame with those names).
my_data <- cbind(predicted = y_pred$predict,
observed = test_set$ViolentCrimesPerPop) %>% as.data.frame()
# Scatter plot of predicted vs observed values with a fitted linear trend.
# BUG FIX: the title previously used round(enexpr(Adjusted_R2), 2);
# rlang::enexpr() defuses its argument and returns a symbol, which
# round() cannot handle. Evaluate the numeric value directly, matching
# the train-set plot below.
g <- my_data %>%
ggplot(aes(predicted, observed)) +
geom_point(color = "red") +
geom_smooth(method = lm) +
labs(x = "Predicted Crime rate",
y = "Observed Crime rate",
title = glue('Test: Adjusted R2 = {round(Adjusted_R2, 2)}')) +
theme(plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
axis.text.y = element_text(size = 12),
axis.text.x = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))
# Render the ggplot as an interactive plotly widget.
g %>% ggplotly()
## `geom_smooth()` using formula = 'y ~ x'
# Generate model predictions on the training frame and convert the
# resulting H2O frame into a base data frame.
y_pred_train <- as.data.frame(h2o.predict(model, newdata = train))
##
|
| | 0%
|
|======================================================================| 100%
# Convert the H2O training frame to a base data frame.
train_set <- train %>% as.data.frame()
# Residuals and RMSE on the training set (same procedure as the test set).
# Note: this reuses/overwrites the `residuals` object from the test-set
# evaluation above.
residuals <- train_set$ViolentCrimesPerPop - y_pred_train$predict
RMSE_train <- sqrt(mean(residuals^2))
# Mean of the observed target values in the training set.
y_train_mean <- mean(train_set$ViolentCrimesPerPop)
# Total and residual sums of squares for the training set.
tss <- sum((train_set$ViolentCrimesPerPop - y_train_mean)^2)
rss <- sum(residuals^2)
# R-squared on the training set.
R2_train <- 1 - (rss / tss)
# Adjusted R-squared on the training set, penalized by the number of
# predictors `k` relative to the sample size `n`.
n <- train_set %>% nrow()
k <- features %>% length()
Adjusted_R2_train <- 1 - (1 - R2_train) * ((n - 1) / (n - k - 1))
# Assemble predicted and observed training-set values for plotting
# (cbind of two numeric vectors -> matrix -> data frame with those names).
my_data_train <- as.data.frame(
  cbind(predicted = y_pred_train$predict,
        observed = train_set$ViolentCrimesPerPop)
)
# Shared axis/title styling for the evaluation plot.
eval_theme <- theme(
  plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
  axis.text.y = element_text(size = 12),
  axis.text.x = element_text(size = 12),
  axis.title.x = element_text(size = 14),
  axis.title.y = element_text(size = 14)
)
# Predicted vs observed on the training set, with a fitted linear trend.
g_train <- ggplot(my_data_train, aes(predicted, observed)) +
  geom_point(color = "darkred") +
  geom_smooth(method = lm) +
  labs(x = "Predicted Crime rate",
       y = "Observed Crime rate",
       title = glue('Train Set Evaluation: Adjusted R2 = {round(Adjusted_R2_train, 2)}')) +
  eval_theme
# Render the ggplot as an interactive plotly widget.
ggplotly(g_train)
## `geom_smooth()` using formula = 'y ~ x'
# Combine the train-set plot with the earlier test-set plot (g) using
# patchwork; its `+` operator arranges the two ggplots side by side.
library(patchwork)
g_train + g
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Final Summary
# Collect the headline metrics for both data splits into a single tibble.
# RMSE values are rounded to 1 decimal for display; the adjusted R2
# values are reported at full precision.
summary_metrics <- tibble(
  RMSE_train        = round(RMSE_train, 1),
  RMSE_test         = round(RMSE, 1),
  Adjusted_R2_train = Adjusted_R2_train,
  Adjusted_R2_test  = Adjusted_R2
)
# Display the summary table.
summary_metrics
## # A tibble: 1 × 4
## RMSE_train RMSE_test Adjusted_R2_train Adjusted_R2_test
## <dbl> <dbl> <dbl> <dbl>
## 1 0.1 0.1 0.626 0.661